# ------------------------------------------------------------------------------
# Splits the train and validation cohorts each into two random portions on
# which to fit separate models
# Author: Cassidy Shubatt <cshubatt@gmail.com>
# To run: bash 01_prep_split.sh
# ------------------------------------------------------------------------------

# Seeding ----------------------------------------------------------------------
# Fix the RNG state so the random draws made by the `sample()` calls further
# down are repeatable across runs on the same input cohorts.
set.seed(4)

# Libraries --------------------------------------------------------------------
library(here)
library(yaml)
library(data.table)
library(tidyverse)

# Directory the split cohorts are written to at the end of the script
temp <- here("code", "08_train_split_model", "temp")

# Load Data --------------------------------------------------------------------
message("Loading data...")
paths <- read_yaml(here("lib", "filepaths.yml"))

# Both cohort files sit in the same folder under the modeling directory
cohort_dir <- file.path(paths$modeling$dir, "cohorts", "random")
ids <- readRDS(file.path(cohort_dir, "train_cohort_tested.rds"))
ids_val <- readRDS(file.path(cohort_dir, "val_cohort.rds"))

# Split in two cohorts ---------------------------------------------------------

#' Randomly split a cohort into two portions by encounter id
#'
#' Draws `ceiling(nrow(cohort) / 2)` entries of `ed_enc_id` without replacement
#' and flags every row whose id was drawn. Membership is tested with `%in%`, so
#' rows that share an `ed_enc_id` always land in the same portion.
#'
#' @param cohort A data frame with an `ed_enc_id` column.
#' @return `cohort` with two added logical columns: `sample_split1` (`TRUE`
#'   when the row's `ed_enc_id` was drawn) and `sample_split2` (its negation).
split_cohort_in_half <- function(cohort) {
  enc_ids <- cohort$ed_enc_id

  # `%>%` binds tighter than `/`, so `nrow(x)/2 %>% ceiling` evaluates as
  # `nrow(x) / ceiling(2)` and never rounds the sample size. Apply ceiling()
  # to the whole quotient so an odd row count yields an integer size with the
  # extra draw going to split 1. Even row counts are unaffected.
  n_split1 <- ceiling(nrow(cohort) / 2)

  # Draw positions rather than calling sample() on the ids directly: sample(x)
  # treats a single numeric x as 1:x, which would mis-sample a one-row cohort.
  split1_encs <- enc_ids[sample.int(length(enc_ids), n_split1)]

  cohort %>%
    mutate(
      sample_split1 = ed_enc_id %in% split1_encs,
      sample_split2 = !sample_split1
    )
}

# Train is split before val so the two random draws happen in a fixed order
# after the seed is set.
message("Partitioning train into two samples...")
sample_df <- split_cohort_in_half(ids)

message("Partitioning val into two samples...")
sample_df_val <- split_cohort_in_half(ids_val)

# Save -------------------------------------------------------------------------
message("Saving...")

# Output locations inside the temp directory for the two split cohorts
train_out <- file.path(temp, "split_train_cohort.rds")
val_out <- file.path(temp, "split_val_cohort.rds")

write_rds(sample_df, train_out)
write_rds(sample_df_val, val_out)

message("Done.")
